# ==============================================================================
# name: tweets-sentiment.R
# author: Bernhard Clemm
# ==============================================================================

dir <- dirname(dirname(rstudioapi::getSourceEditorContext()$path))
source(paste0(dir, "/code/setup-packages.R"))

# Pre-processing function ======================================================

# Clean tweet text and split it into one row per remaining word.
#
# Args:
#   dt: data frame with at least the columns `id` and `text`.
#
# Returns:
#   Data frame with the columns `id` and `word`: the lower-cased tokens of
#   the cleaned text, without German stop words and without "dass".
#
# NOTE(review): "dass" used to end up in an unused second column of the
# stop-word table and was therefore never removed. It is removed now, so
# token counts per tweet can differ from earlier runs.
preprocess <- function(dt) {
  dt_cleaned <- dt %>%
    mutate(
      text_clean = tolower(text),
      # Drops everything from the first "http" onwards (covers "https" too)
      text_clean = gsub("http.*", "", text_clean),
      text_clean = gsub("&amp", "", text_clean),
      text_clean = gsub("@\\w*", "", text_clean),
      text_clean = gsub("[[:punct:]]", "", text_clean),
      # Text is already lower-case here, so only "rt" can occur at the start
      text_clean = gsub("^rt", "", text_clean),
      text_clean = str_trim(text_clean, side = "both")
    )

  # One stop word per row, so that "dass" really is part of the `word` column
  stop_words_de <- data.frame(
    word = c(stopwords(language = "de"), "dass"),
    stringsAsFactors = FALSE
  )

  dt_cleaned %>%
    select(id, text_clean) %>%
    unnest_tokens(word, text_clean) %>%
    anti_join(stop_words_de, by = "word")
}

# Sentiment via SentiWS ========================================================
# Source: https://wortschatz.uni-leipzig.de/en/download

# Read sentiment scores

# Read a SentiWS word list and return one row per word form with its score.
#
# Each input line is expected to hold three tab-separated fields: the lemma
# followed by "|" and a tag, the score, and a list of inflected forms. The
# form list is split on "," and "-". There is no upper limit on the number
# of forms per entry.
#
# Args:
#   filename: path to a SentiWS text file.
#
# Returns:
#   Data frame with the columns `word` (lemma or inflected form) and `score`.
#   Lemmas come first, followed by the forms ordered by their position in
#   the form list. Empty and missing words are dropped.
read_senti_scores <- function(filename) {
  raw <- read.delim(
    filename,
    header = FALSE, encoding = "UTF-8", stringsAsFactors = FALSE
  )

  # Lemma: everything before the last "|"; NA (dropped below) if there is none
  tagged <- as.character(raw[["V1"]])
  lemma <- ifelse(
    grepl("|", tagged, fixed = TRUE),
    sub("\\|[^|]*$", "", tagged),
    NA_character_
  )

  # A file without any form list has no third column at all
  inflections <- raw[["V3"]]
  if (is.null(inflections)) {
    inflections <- rep(NA_character_, nrow(raw))
  }
  forms <- strsplit(as.character(inflections), "[,-]")
  n_forms <- lengths(forms)

  # Order forms by their position in the list (first forms of all entries,
  # then second forms, ...); order() is stable, so file order is kept within
  # each position
  by_position <- order(sequence(n_forms))

  out <- data.frame(
    word = c(lemma, unlist(forms)[by_position]),
    score = c(raw[["V2"]], rep(raw[["V2"]], times = n_forms)[by_position]),
    stringsAsFactors = FALSE
  )
  out <- out[!is.na(out$word) & out$word != "", , drop = FALSE]
  row.names(out) <- NULL
  out
}

# Load both polarity lists and stack them into one word/score table
positive <- paste0(
  dir, "/data/twitter/SentiWS_v2.0/SentiWS_v2.0_Positive.txt"
) %>%
  read_senti_scores()
negative <- paste0(
  dir, "/data/twitter/SentiWS_v2.0/SentiWS_v2.0_Negative.txt"
) %>%
  read_senti_scores()
sentis <- positive %>%
  bind_rows(negative)

# Summarize wolf tweets sentiment ==============================================

# NOTE(review): if `id` is a 64-bit tweet id, read.csv() will parse it as a
# double and can lose precision — confirm, or read the column as character.
tweets_wolf <- read.csv(paste0(dir, "/data/twitter/tweets_wolf.csv"))

# Pre-process: clean the text and split it into one row per tweet and word
tweets_wolf_unnested <- preprocess(tweets_wolf)

# Join sentiments; words without a lexicon entry get a neutral score of 0
# NOTE(review): preprocess() lower-cases the tweet text, while the lexicon
# words are joined exactly as read from the SentiWS files. Any capitalised
# lexicon entries (e.g. nouns) would never match — confirm this is intended.
tweets_wolf_unnested <- tweets_wolf_unnested %>%
  left_join(sentis, by = "word") %>%
  mutate(score = ifelse(is.na(score), 0, score))

# Compute tweet average over all remaining words (matched or not)
tweets_wolf_sentiws <- tweets_wolf_unnested %>%
  group_by(id) %>%
  summarize(sentiws = mean(score, na.rm = TRUE))

# Standardize tweet scores with the mean and SD of the full tweet dataset
## As we cannot share the full data set, we manually input these stats
tweets_all_mean <- 0.005463373
tweets_all_sd <- 0.04515948
tweets_wolf_sentiws <- tweets_wolf_sentiws %>%
  mutate(sentiws_stand = (sentiws - tweets_all_mean) / tweets_all_sd)

# Compute party average
# Tweets with no words left after pre-processing have no score and stay NA
tweets_wolf <- left_join(
  tweets_wolf, tweets_wolf_sentiws,
  by = "id"
)

sentiment_by_party <- tweets_wolf %>%
  as.data.frame() %>%
  group_by(party) %>%
  get_summary_stats(sentiws_stand)

# Analysis =====================================================================

# Compare standardized sentiment of AfD tweets against all other parties
tweets_wolf <- tweets_wolf %>%
  mutate(afd = factor(ifelse(party == "AfD", "AfD", "Others")))

t.test(sentiws_stand ~ afd, data = tweets_wolf)
